#!/usr/bin/env python
# encoding: utf-8
"""
#-------------------------------------------------------------------#
#                   CONFIDENTIAL --- CUSTOM STUDIOS                 #     
#-------------------------------------------------------------------#
#                                                                   #
#                   @Project Name : Globallawonline                #
#                                                                   #
#                   @File Name    : vnm_judgov.py                      #
#                                                                   #
#                   @Programmer   : 李建                            #
#                                                                   #  
#                   @Start Date   : 2021/3/9 9:55                 #
#                                                                   #
#                   @Last Update  : 2021/3/9 9:55                 #
#                                                                   #
#-------------------------------------------------------------------#
# Classes:越南判决书库的采集蜘蛛                                                          #
#                                                                   #
#-------------------------------------------------------------------#
"""
import hashlib
import time

import scrapy

from ..DateProcessFunction.datefunction import Timefunction
from ..items import MyFileItem
import re


class Spider(scrapy.Spider):
    """Collect Vietnamese court judgments listed on thuvienphapluat.vn.

    ``parse`` walks the paginated search results and schedules a request for
    every judgment link it finds; ``detailed`` converts one judgment page
    into a ``MyFileItem``.
    """

    name = 'vnm_judgov'
    allowed_domains = ['thuvienphapluat.vn']
    # Search-results URL of the Vietnamese case-law database (crawl entry point).
    start_urls = ['https://thuvienphapluat.vn/banan/tim-ban-an?q=&AgentId=0&CityId=65&DistrictId=517&StartPublicDate2=&EndPublishDate2=&type_q=0&AnLeType=&sortType=1&LanguageCode=']
    # Kept for backward compatibility; no method of this class reads it.
    id = 0

    # XPath template for the value cell of the N-th row of the metadata list.
    _FIELD_XPATH = '//*[@id="profile"]/div/ul/li[{}]/div/div[2]//text()'

    @classmethod
    def _field_texts(cls, response, row):
        """Return the stripped text nodes of metadata row ``row`` (1-based)."""
        return [text.strip() for text in response.xpath(cls._FIELD_XPATH.format(row)).getall()]

    @classmethod
    def _spaced_field(cls, response, row):
        """Return the row's text nodes joined by single spaces, without leading spaces."""
        return ' '.join(cls._field_texts(response, row)).lstrip(' ')

    def parse(self, response):
        """Parse one search-results page.

        Yields a request (handled by ``detailed``) for every result entry
        that carries a link, followed by a request for the next results page
        when a pagination link is present.
        """
        for entry in response.xpath('//*[@id="prs"]/div'):
            hrefs = entry.xpath('./div/div[1]/span/a/@href')
            if hrefs:
                # Follow the href attribute itself so an anchor without an
                # href can never be handed to response.follow().
                yield response.follow(hrefs[0], callback=self.detailed, dont_filter=True)

        next_links = response.css('#page-wrapper div.row div.col-md-9.pr-4 div.mt-2 ul li:last-child a')
        # A page without a pagination anchor ends the crawl instead of
        # raising IndexError.
        if next_links:
            yield response.follow(next_links[0], callback=self.parse)

    def detailed(self, response):
        """Parse one judgment page and yield a populated ``MyFileItem``.

        Nothing is yielded when the response status is not 200 or when the
        adjudication date falls before 2013.
        """
        if response.status != 200:
            return

        detail_url = response.url

        # Full text: the serialized node of the first matching element.
        full_text = response.xpath('//*[@id="home"]/div/div[1]').get(default='').strip()

        title = self._spaced_field(response, 1)        # case title
        judg_agency = self._spaced_field(response, 2)  # adjudicating body
        case_number = self._spaced_field(response, 3)  # case number

        # Subject classification: only the first text node is used.
        sort_texts = self._field_texts(response, 5)
        sort_c = sort_texts[0] if sort_texts else ''

        # Adjudication date: only the first text node is parsed.
        adjudication_date = ''
        date_texts = self._field_texts(response, 6)
        if date_texts:
            # NOTE(review): strpdatetime presumably returns a time.struct_time
            # (it is read via .tm_year and passed to time.strftime) - confirm
            # its behaviour on unparseable input.
            parsed = Timefunction().strpdatetime(date_texts[0])
            if parsed.tm_year < 2013:
                # Judgments dated before 2013 are not collected.
                return
            adjudication_date = time.strftime("%Y-%m-%d", parsed)

        # Keywords: text nodes concatenated with no separator.
        key_word = ''.join(self._field_texts(response, 7))

        # The MD5 of the page URL serves both as the unique record ID and,
        # prefixed with 'f', as the download file name.
        url_hash = hashlib.md5(detail_url.encode('utf-8')).hexdigest()
        fina = 'f' + url_hash if detail_url else ''

        item = MyFileItem()
        item['file_urls'] = ''
        item['country'] = 'Vietnam'
        item['website'] = 'judiciary'
        item['modular'] = 'government'
        item['ext'] = 'pdf'
        item['fina'] = fina
        item['title'] = ''
        item['abstractUrl'] = ''
        item['abstract'] = ''
        item['dabstractUrl'] = ''
        item['detail'] = full_text
        item['detailUrl'] = ''
        item['downloadUrl'] = ''

        item['Title'] = title
        item['CaseNumber'] = case_number
        item['KeyWord'] = key_word
        item['SortA'] = 'LAWCOUNTRYYN'
        item['People'] = ''
        item['CaseOfAction'] = ''
        item['UseLaw'] = ''
        item['AdjudicationDate'] = adjudication_date
        item['FullText'] = ''
        item['JudgAgency'] = judg_agency
        item['SortB'] = 'LANGUAGEYNY'
        item['SortC'] = sort_c
        item['CaseSummary'] = ''
        item['Articles'] = ''
        item['Chapter'] = ''
        item['Section'] = ''
        item['SYS_FLD_DIGITFILENAME'] = fina
        item['FileUrl'] = ''
        item['AbstractFileName'] = ''
        item['DownLoadUrl'] = detail_url
        item['DownLoadWebNameC'] = '越南判例数据库'
        item['DownLoadWebNameE'] = "Version Library"
        item['SYSID'] = url_hash
        item['Website'] = 'Version Library'
        item['Isconversion'] = '1'
        item['CaseDate'] = ''

        yield item




